In [1]:
import pandas as pd
import numpy as np 
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import plotly.plotly as py
from plotly import tools
from datetime import date
import seaborn as sns
import random 
import warnings
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)
import zipfile
import os
import winsound
In [2]:
import sys
In [3]:
sys.path.append('c:\\users\\chait\\anaconda3\\lib\\site-packages')
In [4]:
"""Helper Functions to do Plotting"""
def generateLayoutBar(col_name):
    """
    Generate a layout object for bar chart
    """
    layout_bar = go.Layout(
        autosize=False,  # auto size the graph? use False if you are specifying the height and width
        width=800,  # height of the figure in pixels
        height=600,  # height of the figure in pixels
        title="Distribution of {} column".format(col_name),  # title of the figure
        # more granular control on the title font
        titlefont=dict(
            family='Courier New, monospace',  # font family
            size=14,  # size of the font
            color='black'  # color of the font
        ),
        # granular control on the axes objects
        xaxis=dict(
            tickfont=dict(
                family='Courier New, monospace',  # font family
                size=14,  # size of ticks displayed on the x axis
                color='black'  # color of the font
            )
        ),
        yaxis=dict(
            #         range=[0,100],
            title='Percentage',
            titlefont=dict(
                size=14,
                color='black'
            ),
            tickfont=dict(
                family='Courier New, monospace',  # font family
                size=14,  # size of ticks displayed on the y axis
                color='black'  # color of the font
            )
        ),
        font=dict(
            family='Courier New, monospace',  # font family
            color="white",  # color of the font
            size=12  # size of the font displayed on the bar
        )
    )
    return layout_bar


def plotBar(dataframe_name, col_name, top_n=None):
    """
    Plot a bar chart for the categorical columns

    Arguments:
    dataframe name
    categorical column name

    Output:
    Plot
    """
    # create a table with value counts
    temp = dataframe_name[col_name].value_counts()
    if top_n is not None:
        temp = temp.head(top_n)
    # creating a Bar chart object of plotly
    data = [go.Bar(
            x=temp.index.astype(str),  # x axis values
            y=np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100,  # y axis values
            text=['{}%'.format(i) for i in np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100],
            # text to be displayed on the bar, we are doing this to display the '%' symbol along with the number on the bar
            textposition='auto',  # specify at which position on the bar the text should appear
            marker=dict(color='#0047AB'),)]  # change color of the bar
    # color used here Cobalt Blue

    layout_bar = generateLayoutBar(col_name=col_name)

    fig = go.Figure(data=data, layout=layout_bar)
    return iplot(fig)
In [5]:
outpath = "../data/"
In [6]:
os.listdir(outpath)
Out[6]:
['20ba1792-d-Data.zip', 'Data']
In [7]:
# for i in os.listdir(outpath):
#     z = zipfile.ZipFile(outpath + i)
#     z.extractall(path = outpath)
#     z.close()
In [8]:
for i in os.listdir(outpath):
    print(i)
20ba1792-d-Data.zip
Data
In [9]:
datapath = '../data/Data/'
In [10]:
for i in os.listdir(datapath):
    print(i)
Customer_Demographics.xlsx
Customer_Transaction.xlsx
RFM.csv
Store_Master.xlsx
Test_Set.xlsx
In [11]:
Customer_Demographics = pd.read_excel(datapath + 'Customer_Demographics.xlsx')
In [12]:
Customer_Transaction = pd.read_excel(datapath + 'Customer_Transaction.xlsx')
In [13]:
Store_Master = pd.read_excel(datapath + 'Store_Master.xlsx')
In [14]:
Test_Set = pd.read_excel(datapath + 'Test_Set.xlsx')
In [15]:
[i.shape for i in [Customer_Demographics, Customer_Transaction, Store_Master, Test_Set]]
Out[15]:
[(100000, 16), (544649, 15), (35, 15), (400000, 2)]

No information about the units given in the data. Assuming Revenue in dhirams.

Customer_Demographics

In [16]:
Customer_Demographics.head()
Out[16]:
Customer_ID Territory Nationality Income_Range Job_Type Marital_Status Gender State Language Loyalty_Status Birth_date Age Points First_txn_dt Last_accr_txn_dt Last_rdm_txn_dt
0 1800000006365760 United Arab Emirates INDIA Below 5000 Services Married M Abu Dhabi English Gold 01JUN1959:00:00:00 59.0 814.0 02NOV2010:00:00:00 27SEP2018:21:45:25 11MAR2018:19:11:57
1 1800000006365820 United Arab Emirates UNITED ARAB EMIRATES Below 5000 Services Married F Abu Dhabi Arabic Silver 01JAN1900:00:00:00 NaN 212.0 06DEC2010:00:00:00 16AUG2016:21:08:28 16MAR2015:13:39:12
2 1800000006366060 United Arab Emirates SRI LANKA Below 5000 Services Married M Abu Dhabi English Silver 07JUN1959:00:00:00 59.0 186.0 17FEB2011:00:00:00 31AUG2018:20:52:18 30JAN2017:19:19:19
3 1800000006366230 United Arab Emirates JORDAN Below 5000 Services Married F Abu Dhabi Arabic Silver 21JAN1962:00:00:00 57.0 126.0 30OCT2010:00:00:00 30JUL2018:20:30:05 20JUN2017:17:57:27
4 1800000006366570 United Arab Emirates OCCUPIED PALESTINIAN TERRITORY Below 5000 Services Married M Abu Dhabi Arabic Silver 01JAN1900:00:00:00 NaN 197.0 10NOV2010:00:00:00 03SEP2018:22:10:56 17JUN2017:23:08:28
In [17]:
Customer_Demographics.dtypes
Out[17]:
Customer_ID           int64
Territory            object
Nationality          object
Income_Range         object
Job_Type             object
Marital_Status       object
Gender               object
State                object
Language             object
Loyalty_Status       object
Birth_date           object
Age                 float64
Points              float64
First_txn_dt         object
Last_accr_txn_dt     object
Last_rdm_txn_dt      object
dtype: object
In [18]:
from datetime import datetime
In [19]:
datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')
Out[19]:
datetime.datetime(2005, 6, 1, 13, 33)
In [20]:
Customer_Demographics.First_txn_dt[0]
Out[20]:
'02NOV2010:00:00:00'
In [21]:
t = datetime.strptime('02NOV2010:00:00:00', '%d%b%Y:%H:%M:%S')
In [22]:
date_cols_demo = [ i for i in Customer_Demographics.columns if (i.endswith('dt')) or ('date' in i)]
In [23]:
date_cols_demo
Out[23]:
['Birth_date', 'First_txn_dt', 'Last_accr_txn_dt', 'Last_rdm_txn_dt']
In [24]:
Customer_Demographics.First_txn_dt.dtype
Out[24]:
dtype('O')
In [25]:
Customer_Demographics[date_cols_demo] = Customer_Demographics[date_cols_demo].astype('O')
In [26]:
def convertToDate(x):
    try:
        return datetime.strptime(x, '%d%b%Y:%H:%M:%S')
    except:
        return x
In [27]:
Customer_Demographics[date_cols_demo] = Customer_Demographics[date_cols_demo].applymap(lambda x : convertToDate(x))
Feature engineering
In [28]:
date_cols_demo
Out[28]:
['Birth_date', 'First_txn_dt', 'Last_accr_txn_dt', 'Last_rdm_txn_dt']
In [29]:
def extractColTypes(dataset):
    """This functions extracts numeric, categorical , datetime and boolean column types.
    Returns 4 lists with respective column types"""
    num_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['int64','float64']]
    cat_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['object']]
    date_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['datetime64[ns]']]
    bool_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['bool']]
    print ("Numeric Columns:", len(num_cols_list))
    print ("Categorical/Character Columns:", len(cat_cols_list))
    print ("Date Columns:",len(date_cols_list))
    print ("Boolean Columns:",len(bool_cols_list))
    return(num_cols_list,cat_cols_list,date_cols_list,bool_cols_list)
In [30]:
demo_num_cols_list,demo_cat_cols_list,demo_date_cols_list,demo_bool_cols_list = extractColTypes(Customer_Demographics)
Numeric Columns: 3
Categorical/Character Columns: 9
Date Columns: 4
Boolean Columns: 0
In [31]:
demo_cat_cols_list
Out[31]:
['Territory',
 'Nationality',
 'Income_Range',
 'Job_Type',
 'Marital_Status',
 'Gender',
 'State',
 'Language',
 'Loyalty_Status']
Days since first transaction
In [32]:
today = convertToDate('24NOV2018:00:00:00')
In [33]:
def getDaysDiff(x):
    t = -1*(x - today)
    return t.apply(lambda x : x.days)
In [34]:
Customer_Demographics['custSince'] = getDaysDiff(Customer_Demographics.First_txn_dt)
days since last accr and rdm transactions
In [35]:
Customer_Demographics['ds_accr']= getDaysDiff(Customer_Demographics.Last_accr_txn_dt)
In [36]:
Customer_Demographics['ds_rdm']= getDaysDiff(Customer_Demographics.Last_rdm_txn_dt)

Distribution of the categorical columns

In [37]:
for i in demo_cat_cols_list[1:]:
    plotBar(Customer_Demographics, i,top_n=10)

Some columns here like the Income and Marital status are unspecified, as of now we cannot decide if they will help in the prediction, but we should keep a close watch for these.

In [38]:
demo_num_cols_list
Out[38]:
['Customer_ID', 'Age', 'Points']
In [39]:
def plotHist(dataframe, col_name):
    data = [go.Histogram(x=dataframe[col_name],
       marker=dict(
        color='#CC0E1D',# Lava (#CC0E1D)
#         color = 'rgb(200,0,0)'   # you can provide color in HEX format or rgb format, genrally programmers prefer HEX format as it is a single string value and easy to pass as a variable
    ))]
    layout = go.Layout(title = "Histogram of {}".format(col_name))
    fig = go.Figure(data= data, layout=layout)
    return iplot(fig)
In [40]:
plotHist(Customer_Demographics, 'Age')

Looks like the age group of 30-40 are frequent visitors of the store, also we see some entries above 100, I will remove them

In [41]:
Customer_Demographics.Age = Customer_Demographics.Age.clip_upper(100)
In [42]:
plotHist(Customer_Demographics, 'Age')
In [43]:
plotHist(Customer_Demographics, 'Points')

Most people have 0-2000 points

In [44]:
Customer_Demographics[Customer_Demographics.Points>2000].shape
Out[44]:
(1049, 19)

Just above 1000 customers have more than 2000 points

Let us see if these customers are distinguishable from the rest

In [45]:
for i in demo_cat_cols_list[1:]:
    plotBar(Customer_Demographics[Customer_Demographics.Points>2000], i,top_n=10)

The distribution of all the above columns look very similar to those of the unfiltered data. There is no disctinguishing factor as such.

Does loyalty status have correlation with the number of days a customer was with the brand

In [46]:
def plotMultiBox(dataframe,col_name, num_col_name):
    data = []
    for i in dataframe[col_name].unique():
        trace = go.Box(y = dataframe[num_col_name][dataframe[col_name] == i],
                      name = i)
        data.append(trace)
    layout = go.Layout(title="Boxplot of levels in {} for {} column".format(col_name,num_col_name))
    fig = go.Figure(data=data, layout=layout)
    return (iplot(fig))
In [47]:
plotMultiBox(Customer_Demographics, 'Loyalty_Status', 'custSince')

We see that in general, gold customers are with the company longer

In [48]:
plotMultiBox(Customer_Demographics, 'Gender', 'custSince')

Females have been customers for a marginally more number of days than males.

In [49]:
plotMultiBox(Customer_Demographics, 'Income_Range', 'custSince')

Observations:

  • An interesting observation here, customers who earn more than 20,000 Dhirams have been with the company for over 2500 days, also there are no recent entries for such high income groups.
  • The recent entries are from <20000 dhiram earning group. May be the store now has more affordable goods.
  • This could also be a result of the location of the malls

Customer_Transaction

In [50]:
Customer_Transaction.columns
Out[50]:
Index(['Territory', 'Business', 'Year', 'Week', 'Store_Code', 'City_Name',
       'Store_Type', 'Transaction_Type', 'Return_Reason', 'Customer_ID',
       'Invoices', 'Item_Count', 'Revenue', 'Discount', 'Units_Sold'],
      dtype='object')
In [51]:
Customer_Transaction.dtypes
Out[51]:
Territory            object
Business             object
Year                  int64
Week                  int64
Store_Code            int64
City_Name            object
Store_Type           object
Transaction_Type     object
Return_Reason        object
Customer_ID           int64
Invoices              int64
Item_Count            int64
Revenue             float64
Discount            float64
Units_Sold            int64
dtype: object
In [52]:
Customer_Transaction.head(20)
Out[52]:
Territory Business Year Week Store_Code City_Name Store_Type Transaction_Type Return_Reason Customer_ID Invoices Item_Count Revenue Discount Units_Sold
0 United Arab Emirates Max 2010 0 60065 Dubai Stand alone Return Size Problem 1800000058056860 1 3 -169.0 0.0 -3
1 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000020917140 1 1 0.0 0.0 0
2 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000021265010 1 1 0.0 0.0 0
3 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000025693950 1 4 0.0 0.0 0
4 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000044104620 1 1 0.0 0.0 0
5 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000045703150 1 1 0.0 0.0 0
6 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000045740150 1 1 0.0 0.0 0
7 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000046079170 2 7 0.0 0.0 0
8 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000047493340 1 1 0.0 0.0 0
9 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000048130990 1 2 0.0 0.0 0
10 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000054019360 1 1 0.0 0.0 0
11 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000054841260 1 1 0.0 0.0 0
12 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000054994790 1 1 0.0 0.0 0
13 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000058101370 1 1 0.0 0.0 0
14 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000058661850 1 1 0.0 0.0 0
15 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000080131780 1 1 0.0 0.0 0
16 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000080464400 1 2 0.0 0.0 0
17 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000089762380 1 1 0.0 0.0 0
18 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000117190630 1 1 0.0 0.0 0
19 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000117238230 1 1 0.0 0.0 0
In [53]:
plotHist(Customer_Transaction,'Revenue')
In [54]:
Customer_Transaction.Customer_ID = Customer_Transaction.Customer_ID.astype('O')
In [55]:
tran_num_cols_list, tran_cat_cols_list, tran_date_cols_list, tran_bool_cols_list = extractColTypes(Customer_Transaction)
Numeric Columns: 8
Categorical/Character Columns: 7
Date Columns: 0
Boolean Columns: 0
In [56]:
tran_cat_cols_list.remove('Customer_ID')
In [57]:
for i in tran_cat_cols_list:
    plotBar(Customer_Transaction, i,top_n=10)

Observations:

  • 7% return rate in the shops
  • Most returns are related to Size Problems
  • Store type, business and delivery have only one value

The data here helps us do the RFM analysis of a customer

Creating a new column recent_tran_date in Customer_Demographics
In [58]:
 Customer_Demographics[['Last_accr_txn_dt', 'Last_rdm_txn_dt']].isnull().sum()
Out[58]:
Last_accr_txn_dt        6
Last_rdm_txn_dt     28419
dtype: int64
In [59]:
Customer_Demographics['recent_tran_date'] = Customer_Demographics[['Last_accr_txn_dt', 'Last_rdm_txn_dt']].apply(max, axis = 1)
In [60]:
Customer_Demographics[['Last_accr_txn_dt', 'Last_rdm_txn_dt','recent_tran_date']].head()
Out[60]:
Last_accr_txn_dt Last_rdm_txn_dt recent_tran_date
0 2018-09-27 21:45:25 2018-03-11 19:11:57 2018-09-27 21:45:25
1 2016-08-16 21:08:28 2015-03-16 13:39:12 2016-08-16 21:08:28
2 2018-08-31 20:52:18 2017-01-30 19:19:19 2018-08-31 20:52:18
3 2018-07-30 20:30:05 2017-06-20 17:57:27 2018-07-30 20:30:05
4 2018-09-03 22:10:56 2017-06-17 23:08:28 2018-09-03 22:10:56
In [61]:
cust_recent_tran_date = Customer_Demographics[['Customer_ID','recent_tran_date']]
In [63]:
cust_recent_tran_date.isnull().sum()
Out[63]:
Customer_ID         0
recent_tran_date    6
dtype: int64

Recency

In [64]:
# same functions used before
def getRecency(x):
    t = -1*(x - today)
    return t.apply(lambda x : x.days)
In [65]:
custId_Recency = pd.DataFrame(cust_recent_tran_date['Customer_ID'])
In [66]:
custId_Recency['Recency'] = getDaysDiff(cust_recent_tran_date.recent_tran_date)
In [67]:
custId_Recency.shape
Out[67]:
(100000, 2)
In [68]:
custId_Recency.head()
Out[68]:
Customer_ID Recency
0 1800000006365760 57.0
1 1800000006365820 829.0
2 1800000006366060 84.0
3 1800000006366230 116.0
4 1800000006366570 81.0

Frequency

Considering the data in Customer Transaction table, we have the returns data as well, we will consider return as -1 invoice or -revenue

In [69]:
Customer_Transaction.columns
Out[69]:
Index(['Territory', 'Business', 'Year', 'Week', 'Store_Code', 'City_Name',
       'Store_Type', 'Transaction_Type', 'Return_Reason', 'Customer_ID',
       'Invoices', 'Item_Count', 'Revenue', 'Discount', 'Units_Sold'],
      dtype='object')
In [70]:
Customer_Transaction.Transaction_Type.unique()
Out[70]:
array(['Return', 'Purchase'], dtype=object)
In [71]:
# Customer_Transaction[Customer_Transaction.Transaction_Type != 'Return'].shape
In [72]:
Customer_Transaction[Customer_Transaction.Transaction_Type == 'Return'].Invoices = -1 * Customer_Transaction[Customer_Transaction.Transaction_Type == 'Return'].Invoices
In [73]:
custId_Frequency = Customer_Transaction.groupby(['Customer_ID']).agg({'Invoices': np.sum}).reset_index()
In [74]:
custId_Frequency.columns = ['Customer_ID', 'Frequency']
In [75]:
custId_Frequency.shape
Out[75]:
(100000, 2)
In [76]:
custId_Frequency.head()
Out[76]:
Customer_ID Frequency
0 1800000006365760 5
1 1800000006365820 1
2 1800000006366060 7
3 1800000006366230 6
4 1800000006366570 11
In [77]:
custId_Frequency.Frequency[custId_Frequency.Frequency<0]
Out[77]:
Series([], Name: Frequency, dtype: int64)

Monetary Value

In [78]:
custId_Monetary = Customer_Transaction.groupby(['Customer_ID']).agg({'Revenue': np.sum}).reset_index()
In [79]:
custId_Monetary.columns = ['Customer_ID','MonetaryValue']
In [80]:
custId_Monetary.shape
Out[80]:
(100000, 2)
In [81]:
custId_Monetary.head()
Out[81]:
Customer_ID MonetaryValue
0 1800000006365760 1117.00
1 1800000006365820 260.00
2 1800000006366060 607.00
3 1800000006366230 643.80
4 1800000006366570 1285.01

combining R F M

In [82]:
custId_RFM = custId_Recency.join(custId_Frequency.set_index('Customer_ID'),on='Customer_ID')
In [83]:
custId_RFM = custId_RFM.join(custId_Monetary.set_index('Customer_ID'),on='Customer_ID',)
In [84]:
custId_RFM.head()
Out[84]:
Customer_ID Recency Frequency MonetaryValue
0 1800000006365760 57.0 5 1117.00
1 1800000006365820 829.0 1 260.00
2 1800000006366060 84.0 7 607.00
3 1800000006366230 116.0 6 643.80
4 1800000006366570 81.0 11 1285.01
In [85]:
custId_RFM['RecencyRank'] = pd.qcut(custId_RFM.Recency,q = 5, labels = False)
In [86]:
custId_RFM['FrequencyRank'] = pd.qcut(custId_RFM.Frequency,q = 5, labels = False, duplicates = 'drop')
In [87]:
custId_RFM['MonetaryValueRank'] = pd.qcut(custId_RFM.MonetaryValue,q = 5, labels = False , duplicates = 'drop')
In [88]:
custId_RFM.shape
Out[88]:
(100000, 7)
In [89]:
custId_RFM.head()
Out[89]:
Customer_ID Recency Frequency MonetaryValue RecencyRank FrequencyRank MonetaryValueRank
0 1800000006365760 57.0 5 1117.00 0.0 1 4
1 1800000006365820 829.0 1 260.00 4.0 0 2
2 1800000006366060 84.0 7 607.00 1.0 2 3
3 1800000006366230 116.0 6 643.80 2.0 2 3
4 1800000006366570 81.0 11 1285.01 1.0 3 4
In [90]:
custId_RFM.isnull().sum()
Out[90]:
Customer_ID          0
Recency              6
Frequency            0
MonetaryValue        0
RecencyRank          6
FrequencyRank        0
MonetaryValueRank    0
dtype: int64
We do not have data for the 6 records in demographics table, dropping them
In [91]:
custId_RFM.dropna(inplace=True)
In [92]:
custId_RFM.RecencyRank.unique()
Out[92]:
array([0., 4., 1., 2., 3.])
In [93]:
for i in custId_RFM.columns:
    if 'Rank' in i:
        print (i)
        custId_RFM[i] = custId_RFM[i].apply(lambda x : str(int(np.round(x)+1)))
RecencyRank
FrequencyRank
MonetaryValueRank
In [94]:
custId_RFM.RecencyRank.unique()
Out[94]:
array(['1', '5', '2', '3', '4'], dtype=object)
In [95]:
custId_RFM.head()
Out[95]:
Customer_ID Recency Frequency MonetaryValue RecencyRank FrequencyRank MonetaryValueRank
0 1800000006365760 57.0 5 1117.00 1 2 5
1 1800000006365820 829.0 1 260.00 5 1 3
2 1800000006366060 84.0 7 607.00 2 3 4
3 1800000006366230 116.0 6 643.80 3 3 4
4 1800000006366570 81.0 11 1285.01 2 4 5
In [96]:
custId_RFM['RFMScore'] = custId_RFM.RecencyRank + custId_RFM.FrequencyRank + custId_RFM.MonetaryValueRank
In [97]:
custId_RFM.head()
Out[97]:
Customer_ID Recency Frequency MonetaryValue RecencyRank FrequencyRank MonetaryValueRank RFMScore
0 1800000006365760 57.0 5 1117.00 1 2 5 125
1 1800000006365820 829.0 1 260.00 5 1 3 513
2 1800000006366060 84.0 7 607.00 2 3 4 234
3 1800000006366230 116.0 6 643.80 3 3 4 334
4 1800000006366570 81.0 11 1285.01 2 4 5 245
In [98]:
custId_RFM.dtypes
Out[98]:
Customer_ID            int64
Recency              float64
Frequency              int64
MonetaryValue        float64
RecencyRank           object
FrequencyRank         object
MonetaryValueRank     object
RFMScore              object
dtype: object
In [99]:
custId_RFM.to_csv(datapath + "RFM.csv",index = False)
In [100]:
custId_RFM.RFMScore.value_counts()
Out[100]:
511    6711
145    5464
411    4390
512    4053
245    3650
311    3109
412    2893
345    2749
211    2422
312    2323
111    2266
423    2074
212    1971
323    1947
112    1846
513    1839
123    1832
134    1814
223    1746
445    1634
334    1582
234    1563
422    1502
523    1464
144    1418
413    1397
124    1393
324    1322
224    1306
434    1289
       ... 
321     236
221     203
143     182
121     175
432     157
443     147
243     146
515     143
343     138
132     132
332     129
232     103
532     103
215      90
415      85
543      72
315      70
115      57
142      11
542       8
431       7
242       7
231       7
531       6
131       5
442       3
331       2
342       2
341       1
541       1
Name: RFMScore, Length: 97, dtype: int64

Customer Segmentation

Segments

  • Best Customers : Recent, frequent and high monetary value
  • High-spending New Customers : Recent , not so frequent but high spending
  • Lowest-Spending Active Loyal Customers : recent and frequent but spend less
  • Churned Best Customers : Not recent but frequent and high spending
In [101]:
segments_dict = {'155':'Best_Customers',
                 '145':'Best_Customers',
                 '154':'Best_Customers',  
                 '244':'Best_Customers',
                 '254':'Best_Customers',
                 '245':'Best_Customers',
                 
                 '135': 'High_Spending_New_Customers',
                 '235': 'High_Spending_New_Customers',
                 '134': 'High_Spending_New_Customers',
                 '125': 'High_Spending_New_Customers',
                 '124': 'High_Spending_New_Customers',
                 '225': 'High_Spending_New_Customers',
                
                 '151': 'Lowest_Spending_Active_Loyal_Customers',
                 '152': 'Lowest_Spending_Active_Loyal_Customers',
                 '153': 'Lowest_Spending_Active_Loyal_Customers',
                 '141': 'Lowest_Spending_Active_Loyal_Customers',
                 '142': 'Lowest_Spending_Active_Loyal_Customers',
                 '143': 'Lowest_Spending_Active_Loyal_Customers',
                 '251': 'Lowest_Spending_Active_Loyal_Customers',
                 '252': 'Lowest_Spending_Active_Loyal_Customers',
                 '253': 'Lowest_Spending_Active_Loyal_Customers',
                 
                 '515': 'Curned_Best_Customers',
                 '514': 'Curned_Best_Customers',
                 '525': 'Curned_Best_Customers',
                 '524': 'Curned_Best_Customers',
                 '415': 'Curned_Best_Customers',
                 '414': 'Curned_Best_Customers',
                 
                }

We can add mote segments, but for this analysis I am limiting the number of segments

In [102]:
custId_RFM['CustomerSegment'] = custId_RFM.RFMScore
In [103]:
custId_RFM['CustomerSegment'] = custId_RFM['CustomerSegment'].replace(segments_dict)

3-D plotting of Customer Segments

In [104]:
## Best_Customers
Best_Customers = go.Scatter3d(
    x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Best_Customers'],
    y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Best_Customers'],
    z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Best_Customers'],
    mode='markers',name ='Best_Customers',
    marker=dict(
        size=4,
        opacity=0.6
    )
)

## High_Spending_New_Customers
High_Spending_New_Customers = go.Scatter3d(
    x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
    y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
    z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
    mode='markers',name ='High_Spending_New_Customers',
    marker=dict(
        size=4,
        opacity=0.6
    )
)

## Lowest_Spending_Active_Loyal_Customers
Lowest_Spending_Active_Loyal_Customers = go.Scatter3d(
    x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
    y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
    z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
    mode='markers',name ='Lowest_Spending_Active_Loyal_Customers',
    marker=dict(
        size=4,
        opacity=0.6
    )
)
## Curned_Best_Customers
Curned_Best_Customers = go.Scatter3d(
    x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
    y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
    z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
    mode='markers',name ='Curned_Best_Customers',
    marker=dict(
        size=4,
        opacity=0.6
    )
)


data = [Best_Customers,High_Spending_New_Customers, Lowest_Spending_Active_Loyal_Customers ,Curned_Best_Customers]
layout = go.Layout(
                    scene = dict(
                    xaxis = dict(
                        title='Recency',
                        backgroundcolor="black",
                        showbackground=True,
                        titlefont=dict(
                                        size=16,
                                        color='black'
                                        )
                        
                        
                    ),
                    yaxis = dict(
                        title='Frequency',
                        showbackground=True,
                        backgroundcolor="black",
                        titlefont=dict(
                                        size=16,
                                        color='black'
                                        )
                    ),
                    zaxis = dict(
                        title='MonetaryValue',
                        backgroundcolor="black",
                        showbackground=True,
                        titlefont=dict(
                                        size=16,
                                        color='black'
                                        )
                    )
                    ),
                        width=1000, # height of the figure in pixels
                        height=800, # height of the figure in pixels
    margin = dict( b =15),)
fig = go.Figure(data=data, layout=layout)

fig['layout'].update(title= "RFM Customer Segmentation")
iplot(fig)

Observations:

  • Churned best customers had a maximum monetary value of 6000 dhirams, they did not shop in the last 1.5 year.
  • Best customers have shopped in the last 3 months and on an average shopped 50 times.
    • Giving offers/discounts to these customers will be helpful in converting them to long term high valued customers.
  • High spending new customers have a maximum frequency of 9.
  • Lowest spending active loyal customers have visited less than 10 times in the last 2.5 months and spent a maximum of 500 dhirams.
    • These low spendingg loyal customers can be converted to high spending customers if the company can offer discounts on the products.

Thank you for reading!